# Load libraries
library(tidyverse)
library(plotly)
library(future)
library(furrr)
library(tidybulk)
library(cluster)
library(proxy)
library(factoextra)
library(stringr)
# Load data
tt_all <- readRDS("intermediate_data/tt_all.rds")
tt_L3 <- tt_all %>%
pluck("tt", 3)
# select level of interest
LEVEL = "level_3"
# All functions
## 1 preprocess data
### 1.1 string manipulation that converts level of interest (e.g "level_5") to its ancestor level (e.g "level_4")
pre <- function(.level) {
.level %>%
str_split("_") %>%
{as.numeric(.[[1]][2])-1} %>%
paste("level", ., sep = "_")
}
### 1.2 preprocess
preprocess <- function(.data, LEVEL) {
# load data
.data %>%
tidybulk(sample, symbol, count) %>%
# filter for the cell types of interest for gene marker selection
filter(is.na(!!as.symbol(LEVEL))==F) %>%
# Imputation of missing data within each level_5
# impute_missing_abundance(~ !!as.symbol(LEVEL)) %>%
# Group by ancestor
nest(data = - !!as.symbol(pre(LEVEL))) %>%
# Eliminate genes that are present in some but all cell types
# (can be still present in a subset of samples from each cell-type)
mutate(data = map(
data,
~ .x %>%
nest(data = -c(symbol, !!as.symbol(LEVEL))) %>%
add_count(symbol) %>%
filter(n == max(n)) %>%
unnest(data)
)) %>%
# Imputation of missing data within each level_5
mutate(data = map(data, ~ .x %>% impute_missing_abundance(~ !!as.symbol(LEVEL)))) %>%
# scale count for further analysis
mutate(data=map(data, ~ .x %>%
identify_abundant(factor_of_interest = !!as.symbol(LEVEL)) %>%
scale_abundance()
))
}
## 2 contrast functions
### 2.1 pairwise comparisons
get_contrasts_from_df = function(.data, LEVEL){
.data %>%
distinct(!!as.symbol(LEVEL)) %>%
mutate(!!as.symbol(LEVEL) := paste0(LEVEL, !!as.symbol(LEVEL))) %>%
# Permute
mutate(cell_type2 := !!as.symbol(LEVEL)) %>%
expand(!!as.symbol(LEVEL), cell_type2) %>%
filter(!!as.symbol(LEVEL) != cell_type2) %>%
# Create contrasts
mutate(contrast = sprintf("%s - %s", !!as.symbol(LEVEL), cell_type2)) %>%
pull(contrast)
}
### 2.2 create a contrast vector for limma::makeContrasts() or tidybulk::test_differential abundance()
mean_contrast <- function(.data, LEVEL){
# find all cell types
cell_types <- .data %>%
distinct(!!as.symbol(LEVEL)) %>%
pull() %>%
as.vector()
# format cell_types with prefix
cell_types <- paste0(LEVEL, cell_types)
# initialise a vector called contrasts
contrasts <- 1: length(cell_types)
# create all contrasts and store them in contrasts
for(i in 1:length(cell_types) ){
background = paste(cell_types[-i], collapse = "+")
divisor = length(cell_types[-i])
contrasts[i] <- sprintf("%s-(%s)/%s", cell_types[i], background, divisor)
}
return(contrasts)
}
### 2.3 contrast with no average background
mean_contrasts2 <- function(.data, LEVEL){
# find all cell types
cell_types <- .data %>%
distinct(!!as.symbol(LEVEL)) %>%
pull() %>%
as.vector()
# format cell_types with prefix
cell_types <- paste0(LEVEL, cell_types)
# initialise a vector called contrasts
contrasts <- 1: length(cell_types)
# create all contrasts and store them in contrasts
for(i in 1: length(cell_types) ){
background = paste(cell_types[-i], collapse = "+")
contrasts[i] <- sprintf("%s-(%s)", cell_types[i], background)
}
return(contrasts)
}
## 3 marker ranking & selection
select_markers_for_each_contrast = function(.markers, sig_size){
.markers %>%
# Group by contrast. Comparisons both ways.
pivot_longer(
cols = contains("___"),
names_to = c("stats", "contrast"),
values_to = ".value",
names_sep="___"
) %>%
# Markers selection within each pair of contrast
nest(stat_df = -contrast) %>%
# Reshape inside each contrast
mutate(stat_df = map(stat_df, ~.x %>% pivot_wider(names_from = stats, values_from = .value))) %>%
# Rank
mutate(stat_df = map(stat_df, ~.x %>%
filter(FDR < 0.05 & logFC > 2) %>%
filter(logCPM > mean(logCPM)) %>%
arrange(logFC %>% desc()) %>%
slice(1:sig_size)
)) %>%
unnest(stat_df)
}
## 4 marker collection for each contrast
contrast1 <- function(tt, LEVEL){
tt %>%
# Differential transcription
mutate(markers = map(
data,
~ test_differential_abundance(.x,
~ 0 + !!as.symbol(LEVEL),
.contrasts = get_contrasts_from_df(.x, LEVEL),
action="only")
))
}
contrast2 <- function(tt, LEVEL){
tt %>%
# Differential transcription
mutate(markers = map(
data,
~ test_differential_abundance(.x,
~ 0 + !!as.symbol(LEVEL),
.contrasts = mean_contrast(.x, LEVEL),
action="only")
))
}
sig_select <- function(.contrast, LEVEL, sig_size) {
.contrast %>%
# Select markers from each contrast by rank of stats
mutate(markers = map(markers, ~ select_markers_for_each_contrast(.x, sig_size))) %>%
# Add original data info to the markers selected
mutate(markers = map2(markers, data, ~ left_join(.x, .y))) %>%
select(!!as.symbol(pre(LEVEL)), markers) %>%
unnest(markers) %>%
# make contrasts pretty
mutate(contrast_pretty = str_replace(contrast, LEVEL, "") %>% str_replace(LEVEL, ""))
}
## 5 calculate the area of confidence ellipses and the sum of their areas
ellip_area <- function(.markers, LEVEL){
# reduce dimension
PCA <- .markers %>%
distinct(sample, symbol, count_scaled, !!as.symbol(LEVEL)) %>%
reduce_dimensions(sample, symbol, count_scaled, method = "PCA", action = "add", transform = log1p)
# number of unique symbols for each cell type
real_size <- PCA %>%
nest(data=-!!as.symbol(LEVEL)) %>%
mutate(real_size=map_int(data, ~ n_distinct(.x$symbol)))
area <- PCA %>%
# remove non-numerical data to form a numerical data frame
select(!!as.symbol(LEVEL), PC1, PC2) %>%
# normalize principle component values
mutate(across(c("PC1", "PC2"), ~ .x %>% scale())) %>%
# nest by cell_type so as to calculate ellipse area for each cell type
nest(PC = - !!as.symbol(LEVEL)) %>%
# obtain covariance matrix for each cell type
mutate(cov = map(PC, ~ cov(.x))) %>%
# calculate the eigenvalues for the covariance matrix of each cell type
mutate(eigval = map(cov, ~ eigen(.x)$values)) %>%
# transformation
mutate(area = map(eigval, ~ sqrt(.x * qchisq(0.95, 2)))) %>%
# below is the actual area for each ellipse
mutate(area = map_dbl(area, ~ prod(.x)*pi)) %>%
# collect size of each cluster as factors for weights
mutate(cluster_size = map_int(PC, ~ nrow(.x))) %>%
# weight each area by the inverse of its cluster size
mutate(weighted_area = map2_dbl(area, cluster_size, ~ .x / .y))
left_join(area, real_size)
}
## 5.2 Ellipse area calculation for a series of sig_sizes
area_df_func <- function(.area_df, LEVEL){
.area_df %>%
nest(markers = - !!as.symbol(pre(LEVEL))) %>%
mutate(ellip = map(markers, ~ .x %>% ellip_area(LEVEL)))
}
## 6 Silhouette score calculation for a series of sig_sizes
sil_func <- function(.sil_df, LEVEL){
.sil_df %>%
nest(pca = - !!as.symbol(pre(LEVEL))) %>%
mutate(pca = map(pca, ~ .x %>%
distinct(sample, symbol, count_scaled, !!as.symbol(LEVEL)))) %>%
mutate(pca = map(pca, ~ .x %>%
reduce_dimensions(sample, symbol, count_scaled,
method = "PCA",
action = "add",
transform = log1p) %>%
# save symbols for calculating real_size while reducing replicated rows resulted from symbol
nest(data_symbol = c(symbol, count_scaled))
)) %>%
# calculate the dissimilarity matrix with PC values
mutate(distance = map(pca, ~ .x %>%
select(contains("PC")) %>%
factoextra::get_dist(method = "euclidean")
)) %>%
# calculate silhouette score
mutate(sil = map2(pca, distance,
~ silhouette(as.numeric(as.factor(`$`(.x, !!as.symbol(LEVEL)))), .y)
)) %>%
mutate(sil = map(sil, ~ .x %>% summary())) %>%
mutate(sil = map(sil, ~ .x %>%
`$`(avg.width) ))%>%
mutate(sil = unlist(sil)) %>%
# obtain the actual number of signature genes
mutate(real_size = map_int(pca, ~ .x$data_symbol %>%
map_int(~ n_distinct(.x$symbol)) %>%
unlist() %>%
unique() ))
}
# 1 Setup data frame & preprocessing
# tt_L3 <- preprocess(counts, LEVEL)
# View cell types on level_1
tt_L3 %>%
unnest(data) %>%
select(!!as.symbol(pre(LEVEL))) %>%
distinct()
## # A tibble: 5 x 1
## level_2
## <chr>
## 1 mono_derived
## 2 t_cell
## 3 granulocyte
## 4 b_cell
## 5 natural_killer
# 2 No selection of markers
tt_naive_L3 <-
tt_L3 %>%
# Scale and reduce dimensions
mutate(data = map(
data,
~ .x %>%
reduce_dimensions(method="PCA")
# reduce_dimensions(method="MDS") %>%
# reduce_dimensions(method="tSNE")
))
PCA_naive_mono <- tt_naive_L3 %>%
pluck("data", 1) %>%
ggplot(aes(x = PC1, y = PC2, colour = !!as.symbol(LEVEL), label = sample)) +
geom_point() +
stat_ellipse(type = 't')+
theme_bw()
PCA_naive_mono
PCA_naive_t <- tt_naive_L3 %>%
pluck("data", 2) %>%
ggplot(aes(x = PC1, y = PC2, colour = !!as.symbol(LEVEL), label = sample)) +
geom_point() +
stat_ellipse(type = 't')+
theme_bw()
PCA_naive_t
PCA_naive_granulocyte <- tt_naive_L3 %>%
pluck("data", 3) %>%
ggplot(aes(x = PC1, y = PC2, colour = !!as.symbol(LEVEL), label = sample)) +
geom_point() +
stat_ellipse(type = 't')+
theme_bw()
PCA_naive_granulocyte
PCA_naive_b <- tt_naive_L3 %>%
pluck("data", 4) %>%
ggplot(aes(x = PC1, y = PC2, colour = !!as.symbol(LEVEL), label = sample)) +
geom_point() +
stat_ellipse(type = 't')+
theme_bw()
PCA_naive_b
PCA_naive_NK <- tt_naive_L3 %>%
pluck("data", 5) %>%
ggplot(aes(x = PC1, y = PC2, colour = !!as.symbol(LEVEL), label = sample)) +
geom_point() +
stat_ellipse(type = 't')+
theme_bw()
PCA_naive_NK
contrast_L3 <- tt_all %>%
pluck("contrast1", 3)
sig_size <- 20
# create a tibble that stores the confidence ellipse area output for each signature size
area_tb <-
tibble(sig_size = 1:sig_size) %>%
# slice(1) %>%
mutate(area_df = map(sig_size, ~ sig_select(contrast_L3, LEVEL, .x))) %>%
mutate(area_df = map(area_df, ~ area_df_func(.x, LEVEL)))
# rescale areas and plot total areas vs the total number of markers selected from cell types in a level
area_data <- area_tb %>%
unnest(area_df) %>%
unnest(ellip) %>%
# nest by ancestor cell type to rescale area for all sig_sizes
nest(cell_data = - !!as.symbol(pre(LEVEL))) %>%
mutate(cell_data = map(cell_data, ~ .x %>% mutate(rescaled_area = area %>% scale(center = F)))) %>%
unnest(cell_data) %>%
# nest by ancestor cell type to summarise areas for each real_size/sig_size
nest(cell_data = - !!as.symbol(pre(LEVEL))) %>%
mutate(plot_data = map(cell_data, ~ .x %>%
group_by(real_size) %>%
summarise(sig_size,
stded_sum=sum(area, na.rm = T),
wted_sum = sum(weighted_area, na.rm = T),
rescaled_sum= sum(rescaled_area, na.rm = T)) %>%
pivot_longer(ends_with("sum"), names_to='area_type', values_to="area_value")
))
Signature size selection plot for mono_derived cell:
mono_elli <- area_data %>%
pluck("plot_data", 1) %>%
ggplot(aes(real_size, area_value, colour=area_type)) +
geom_line() +
geom_point() +
# scale_x_continuous(sec.axis = sec_axis(as.factor())) +
# facet_grid(rows = vars(area_type), scales = "free_y")
facet_wrap(~ area_type, scales = "free_y")
mono_elli
The elbow point indicates optimal sig_size is \(5\). PCA at sig_size = \(5\):
sig_size <- 5
PCA_level3 <- contrast_L3 %>%
sig_select(LEVEL, sig_size) %>%
nest(pca = - !!as.symbol(pre(LEVEL))) %>%
mutate(pca = map(pca, ~ .x %>%
distinct(sample, symbol, count_scaled, !!as.symbol(LEVEL)))) %>%
mutate(pca = map(pca, ~ .x %>%
reduce_dimensions(
sample, symbol, count_scaled,
method = "PCA",
action="get",
transform = log1p)))
PCA3_mono_elli <- PCA_level3 %>%
pluck("pca", 1) %>%
ggplot(aes(x = PC1, y = PC2, colour = !!as.symbol(LEVEL), label = sample)) +
geom_point() +
stat_ellipse(type = 't')+
theme_bw()
PCA3_mono_elli
Signature size selection plot for t cell:
t_elli <- area_data %>%
pluck("plot_data", 2) %>%
ggplot(aes(real_size, area_value, colour=area_type)) +
geom_line() +
geom_point() +
# facet_grid(rows = vars(area_type), scales = "free_y")
facet_wrap(~ area_type, scales = "free_y")
t_elli
The elbow point indicates optimal sig_size is \(4\). PCA at sig_size = \(4\):
sig_size <- 4
PCA_level3 <- contrast_L3 %>%
sig_select(LEVEL, sig_size) %>%
nest(pca = - !!as.symbol(pre(LEVEL))) %>%
mutate(pca = map(pca, ~ .x %>%
distinct(sample, symbol, count_scaled, !!as.symbol(LEVEL)))) %>%
mutate(pca = map(pca, ~ .x %>%
reduce_dimensions(
sample, symbol, count_scaled,
method = "PCA",
action="get",
transform = log1p)))
PCA3_t_elli <- PCA_level3 %>%
pluck("pca", 2) %>%
ggplot(aes(x = PC1, y = PC2, colour = !!as.symbol(LEVEL), label = sample)) +
geom_point() +
stat_ellipse(type = 't')+
theme_bw()
PCA3_t_elli
Signature size selection plot for granulocyte:
granulocyte_elli <- area_data %>%
pluck("plot_data", 3) %>%
ggplot(aes(real_size, area_value, colour=area_type)) +
geom_line() +
geom_point() +
# facet_grid(rows = vars(area_type), scales = "free_y")
facet_wrap(~ area_type, scales = "free_y")
granulocyte_elli
The elbow point indicates optimal sig_size is \(2\). PCA at sig_size = \(2\):
sig_size <- 2
PCA_level3 <- contrast_L3 %>%
sig_select(LEVEL, sig_size) %>%
nest(pca = - !!as.symbol(pre(LEVEL))) %>%
mutate(pca = map(pca, ~ .x %>%
distinct(sample, symbol, count_scaled, !!as.symbol(LEVEL)))) %>%
mutate(pca = map(pca, ~ .x %>%
reduce_dimensions(
sample, symbol, count_scaled,
method = "PCA",
action="get",
transform = log1p)))
PCA3_granulocyte <- PCA_level3 %>%
pluck("pca", 3) %>%
ggplot(aes(x = PC1, y = PC2, colour = !!as.symbol(LEVEL), label = sample)) +
geom_point() +
stat_ellipse(type = 't')+
theme_bw()
PCA3_granulocyte
Signature size selection plot for b cell:
b_elli <- area_data %>%
pluck("plot_data", 4) %>%
ggplot(aes(real_size, area_value, colour=area_type)) +
geom_line() +
geom_point() +
# facet_grid(rows = vars(area_type), scales = "free_y")
facet_wrap(~ area_type, scales = "free_y")
b_elli
The elbow point indicates optimal sig_size is \(4\). PCA at sig_size = \(4\):
sig_size <- 4
PCA_level3 <- contrast_L3 %>%
sig_select(LEVEL, sig_size) %>%
nest(pca = - !!as.symbol(pre(LEVEL))) %>%
mutate(pca = map(pca, ~ .x %>%
distinct(sample, symbol, count_scaled, !!as.symbol(LEVEL)))) %>%
mutate(pca = map(pca, ~ .x %>%
reduce_dimensions(
sample, symbol, count_scaled,
method = "PCA",
action="get",
transform = log1p)))
PCA3_b <- PCA_level3 %>%
pluck("pca", 4) %>%
ggplot(aes(x = PC1, y = PC2, colour = !!as.symbol(LEVEL), label = sample)) +
geom_point() +
stat_ellipse(type = 't')+
theme_bw()
PCA3_b
Signature size selection plot for NK:
NK_elli <- area_data %>%
pluck("plot_data", 5) %>%
ggplot(aes(real_size, area_value, colour=area_type)) +
geom_line() +
geom_point() +
# facet_grid(rows = vars(area_type), scales = "free_y")
facet_wrap(~ area_type, scales = "free_y")
NK_elli
The elbow point indicates optimal sig_size is \(4\). PCA at sig_size = \(4\):
sig_size <- 4
PCA_level3 <- contrast_L3 %>%
sig_select(LEVEL, sig_size) %>%
nest(pca = - !!as.symbol(pre(LEVEL))) %>%
mutate(pca = map(pca, ~ .x %>%
distinct(sample, symbol, count_scaled, !!as.symbol(LEVEL)))) %>%
mutate(pca = map(pca, ~ .x %>%
reduce_dimensions(
sample, symbol, count_scaled,
method = "PCA",
action="get",
transform = log1p)))
PCA3_NK <- PCA_level3 %>%
pluck("pca", 5) %>%
ggplot(aes(x = PC1, y = PC2, colour = !!as.symbol(LEVEL), label = sample)) +
geom_point() +
stat_ellipse(type = 't')+
theme_bw()
PCA3_NK
sig_size <- 60
sil_tb <-
tibble(sig_size = 1:sig_size) %>%
# slice(1) %>%
mutate(sil_df = map(sig_size, ~ sig_select(contrast_L3, LEVEL, .x))) %>%
mutate(sil_df = map(sil_df, ~.x %>% sil_func(LEVEL)))
sil_data <- sil_tb %>%
unnest(sil_df) %>%
nest(plot_data = - !!as.symbol(pre(LEVEL)))
Signature size selection plot for mono_derived cell:
mono_sil <- sil_data %>%
pluck("plot_data", 1) %>%
ggplot(aes(real_size, sil)) +
geom_line() +
geom_point()
mono_sil
sil_data %>%
pluck("plot_data", 1) %>%
pull(sil) %>%
which.max()
## [1] 60
The peak is reached when sig_size = \(11\). PCA at sig_size = \(11\):
sig_size <- 11
PCA_level3 <- contrast_L3 %>%
sig_select(LEVEL, sig_size) %>%
nest(pca = - !!as.symbol(pre(LEVEL))) %>%
mutate(pca = map(pca, ~ .x %>%
distinct(sample, symbol, count_scaled, !!as.symbol(LEVEL)))) %>%
mutate(pca = map(pca, ~ .x %>%
reduce_dimensions(
sample, symbol, count_scaled,
method = "PCA",
action="get",
transform = log1p)))
PCA3_mono_sil <- PCA_level3 %>%
pluck("pca", 1) %>%
ggplot(aes(x = PC1, y = PC2, colour = !!as.symbol(LEVEL), label = sample)) +
geom_point() +
stat_ellipse(type = 't')+
theme_bw()
PCA3_mono_sil
Signature size selection plot for t cell:
t_sil <- sil_data %>%
pluck("plot_data", 2) %>%
ggplot(aes(real_size, sil)) +
geom_line() +
geom_point()
t_sil
sil_data %>%
pluck("plot_data", 2) %>%
pull(sil) %>%
which.max()
## [1] 5
The peak is reached when sig_size = \(5\). PCA at sig_size = \(5\):
sig_size <- 5
PCA_level3 <- contrast_L3 %>%
sig_select(LEVEL, sig_size) %>%
nest(pca = - !!as.symbol(pre(LEVEL))) %>%
mutate(pca = map(pca, ~ .x %>%
distinct(sample, symbol, count_scaled, !!as.symbol(LEVEL)))) %>%
mutate(pca = map(pca, ~ .x %>%
reduce_dimensions(
sample, symbol, count_scaled,
method = "PCA",
action="get",
transform = log1p)))
PCA3_t_sil <- PCA_level3 %>%
pluck("pca", 2) %>%
ggplot(aes(x = PC1, y = PC2, colour = !!as.symbol(LEVEL), label = sample)) +
geom_point() +
stat_ellipse(type = 't')+
theme_bw()
PCA3_t_sil
Signature size selection plot for granulocyte:
granulocyte_sil <- sil_data %>%
pluck("plot_data", 3) %>%
ggplot(aes(real_size, sil)) +
geom_line() +
geom_point()
granulocyte_sil
sil_data %>%
pluck("plot_data", 3) %>%
pull(sil) %>%
which.max()
## [1] 1
The peak is reached when sig_size = \(1\). PCA at sig_size = \(1\):
sig_size <- 1
PCA_level3 <- contrast_L3 %>%
sig_select(LEVEL, sig_size) %>%
nest(pca = - !!as.symbol(pre(LEVEL))) %>%
mutate(pca = map(pca, ~ .x %>%
distinct(sample, symbol, count_scaled, !!as.symbol(LEVEL)))) %>%
mutate(pca = map(pca, ~ .x %>%
reduce_dimensions(
sample, symbol, count_scaled,
method = "PCA",
action="get",
transform = log1p)))
PCA3_granulocyte_sil <- PCA_level3 %>%
pluck("pca", 3) %>%
ggplot(aes(x = PC1, y = PC2, colour = !!as.symbol(LEVEL), label = sample)) +
geom_point() +
stat_ellipse(type = 't')+
theme_bw()
PCA3_granulocyte_sil
Signature size selection plot for b cell:
b_sil <- sil_data %>%
pluck("plot_data", 4) %>%
ggplot(aes(real_size, sil)) +
geom_line() +
geom_point()
b_sil
sil_data %>%
pluck("plot_data", 4) %>%
pull(sil) %>%
which.max()
## [1] 4
The peak is reached when sig_size = \(4\). PCA at sig_size = \(4\):
sig_size <- 4
PCA_level3 <- contrast_L3 %>%
sig_select(LEVEL, sig_size) %>%
nest(pca = - !!as.symbol(pre(LEVEL))) %>%
mutate(pca = map(pca, ~ .x %>%
distinct(sample, symbol, count_scaled, !!as.symbol(LEVEL)))) %>%
mutate(pca = map(pca, ~ .x %>%
reduce_dimensions(
sample, symbol, count_scaled,
method = "PCA",
action="get",
transform = log1p)))
PCA3_b_sil <- PCA_level3 %>%
pluck("pca", 4) %>%
ggplot(aes(x = PC1, y = PC2, colour = !!as.symbol(LEVEL), label = sample)) +
geom_point() +
stat_ellipse(type = 't')+
theme_bw()
PCA3_b_sil
Signature size selection plot for NK:
NK_sil <- sil_data %>%
pluck("plot_data", 5) %>%
ggplot(aes(real_size, sil)) +
geom_line() +
geom_point()
NK_sil
sil_data %>%
pluck("plot_data", 5) %>%
pull(sil) %>%
which.max()
## [1] 15
The peak is reached when sig_size = \(15\). PCA at sig_size = \(15\):
sig_size <- 15
PCA_level3 <- contrast_L3 %>%
sig_select(LEVEL, sig_size) %>%
nest(pca = - !!as.symbol(pre(LEVEL))) %>%
mutate(pca = map(pca, ~ .x %>%
distinct(sample, symbol, count_scaled, !!as.symbol(LEVEL)))) %>%
mutate(pca = map(pca, ~ .x %>%
reduce_dimensions(
sample, symbol, count_scaled,
method = "PCA",
action="get",
transform = log1p)))
PCA3_NK_sil <- PCA_level3 %>%
pluck("pca", 5) %>%
ggplot(aes(x = PC1, y = PC2, colour = !!as.symbol(LEVEL), label = sample)) +
geom_point() +
stat_ellipse(type = 't')+
theme_bw()
PCA3_NK_sil
mean_contrast_L3 <- tt_all %>%
pluck("contrast2", 3)
sig_size <- 20
# create a tibble that stores the confidence ellipse area output for each signature size
area_tb <-
tibble(sig_size = 1:sig_size) %>%
# slice(1) %>%
mutate(area_df = map(sig_size, ~ sig_select(mean_contrast_L3, LEVEL, .x))) %>%
mutate(area_df = map(area_df, ~ area_df_func(.x, LEVEL)))
# rescale areas and plot total areas vs the total number of markers selected from cell types in a level
area_data <- area_tb %>%
unnest(area_df) %>%
unnest(ellip) %>%
# nest by ancestor cell type to rescale area for all sig_sizes
nest(cell_data = - !!as.symbol(pre(LEVEL))) %>%
mutate(cell_data = map(cell_data, ~ .x %>% mutate(rescaled_area = area %>% scale(center = F)))) %>%
unnest(cell_data) %>%
# nest by ancestor cell type to summarise areas for each real_size/sig_size
nest(cell_data = - !!as.symbol(pre(LEVEL))) %>%
mutate(plot_data = map(cell_data, ~ .x %>%
group_by(real_size) %>%
summarise(sig_size,
stded_sum=sum(area, na.rm = T),
wted_sum = sum(weighted_area, na.rm = T),
rescaled_sum= sum(rescaled_area, na.rm = T)) %>%
pivot_longer(ends_with("sum"), names_to='area_type', values_to="area_value")
))
Signature size selection plot for mono_derived cell:
mono_elli <- area_data %>%
pluck("plot_data", 1) %>%
ggplot(aes(real_size, area_value, colour=area_type)) +
geom_line() +
geom_point() +
# scale_x_continuous(sec.axis = sec_axis(as.factor())) +
# facet_grid(rows = vars(area_type), scales = "free_y")
facet_wrap(~ area_type, scales = "free_y")
mono_elli
The elbow point indicates optimal sig_size is \(7\). PCA at sig_size = \(7\):
sig_size <- 7
PCA_level3 <- mean_contrast_L3 %>%
sig_select(LEVEL, sig_size) %>%
nest(pca = - !!as.symbol(pre(LEVEL))) %>%
mutate(pca = map(pca, ~ .x %>%
distinct(sample, symbol, count_scaled, !!as.symbol(LEVEL)))) %>%
mutate(pca = map(pca, ~ .x %>%
reduce_dimensions(
sample, symbol, count_scaled,
method = "PCA",
action="get",
transform = log1p)))
PCA3_mono_elli <- PCA_level3 %>%
pluck("pca", 1) %>%
ggplot(aes(x = PC1, y = PC2, colour = !!as.symbol(LEVEL), label = sample)) +
geom_point() +
stat_ellipse(type = 't')+
theme_bw()
PCA3_mono_elli
Signature size selection plot for t cell:
t_elli <- area_data %>%
pluck("plot_data", 2) %>%
ggplot(aes(real_size, area_value, colour=area_type)) +
geom_line() +
geom_point() +
# facet_grid(rows = vars(area_type), scales = "free_y")
facet_wrap(~ area_type, scales = "free_y")
t_elli
The elbow point indicates optimal sig_size is \(7\). PCA at sig_size = \(7\):
sig_size <- 7
PCA_level3 <- mean_contrast_L3 %>%
sig_select(LEVEL, sig_size) %>%
nest(pca = - !!as.symbol(pre(LEVEL))) %>%
mutate(pca = map(pca, ~ .x %>%
distinct(sample, symbol, count_scaled, !!as.symbol(LEVEL)))) %>%
mutate(pca = map(pca, ~ .x %>%
reduce_dimensions(
sample, symbol, count_scaled,
method = "PCA",
action="get",
transform = log1p)))
PCA3_t_elli <- PCA_level3 %>%
pluck("pca", 2) %>%
ggplot(aes(x = PC1, y = PC2, colour = !!as.symbol(LEVEL), label = sample)) +
geom_point() +
stat_ellipse(type = 't')+
theme_bw()
PCA3_t_elli
Signature size selection plot for granulocytes:
granulocyte_elli <- area_data %>%
pluck("plot_data", 3) %>%
ggplot(aes(real_size, area_value, colour=area_type)) +
geom_line() +
geom_point() +
# facet_grid(rows = vars(area_type), scales = "free_y")
facet_wrap(~ area_type, scales = "free_y")
granulocyte_elli
The elbow point indicates optimal sig_size is \(2\). PCA at sig_size = \(2\):
sig_size <- 2
PCA_level3 <- mean_contrast_L3 %>%
sig_select(LEVEL, sig_size) %>%
nest(pca = - !!as.symbol(pre(LEVEL))) %>%
mutate(pca = map(pca, ~ .x %>%
distinct(sample, symbol, count_scaled, !!as.symbol(LEVEL)))) %>%
mutate(pca = map(pca, ~ .x %>%
reduce_dimensions(
sample, symbol, count_scaled,
method = "PCA",
action="get",
transform = log1p)))
PCA3_granulocyte__elli <- PCA_level3 %>%
pluck("pca", 3) %>%
ggplot(aes(x = PC1, y = PC2, colour = !!as.symbol(LEVEL), label = sample)) +
geom_point() +
stat_ellipse(type = 't')+
theme_bw()
PCA3_granulocyte__elli
Signature size selection plot for b cells:
b_elli <- area_data %>%
pluck("plot_data", 4) %>%
ggplot(aes(real_size, area_value, colour=area_type)) +
geom_line() +
geom_point() +
# facet_grid(rows = vars(area_type), scales = "free_y")
facet_wrap(~ area_type, scales = "free_y")
b_elli
The elbow point indicates optimal sig_size is \(4\). PCA at sig_size = \(4\):
sig_size <- 4
PCA_level3 <- mean_contrast_L3 %>%
sig_select(LEVEL, sig_size) %>%
nest(pca = - !!as.symbol(pre(LEVEL))) %>%
mutate(pca = map(pca, ~ .x %>%
distinct(sample, symbol, count_scaled, !!as.symbol(LEVEL)))) %>%
mutate(pca = map(pca, ~ .x %>%
reduce_dimensions(
sample, symbol, count_scaled,
method = "PCA",
action="get",
transform = log1p)))
PCA3_b__elli <- PCA_level3 %>%
pluck("pca", 4) %>%
ggplot(aes(x = PC1, y = PC2, colour = !!as.symbol(LEVEL), label = sample)) +
geom_point() +
stat_ellipse(type = 't')+
theme_bw()
PCA3_b__elli
Signature size selection plot for NK:
NK_elli <- area_data %>%
pluck("plot_data", 5) %>%
ggplot(aes(real_size, area_value, colour=area_type)) +
geom_line() +
geom_point() +
# facet_grid(rows = vars(area_type), scales = "free_y")
facet_wrap(~ area_type, scales = "free_y")
NK_elli
The elbow point indicates optimal sig_size is \(4\). PCA at sig_size = \(4\):
sig_size <- 4
PCA_level3 <- mean_contrast_L3 %>%
sig_select(LEVEL, sig_size) %>%
nest(pca = - !!as.symbol(pre(LEVEL))) %>%
mutate(pca = map(pca, ~ .x %>%
distinct(sample, symbol, count_scaled, !!as.symbol(LEVEL)))) %>%
mutate(pca = map(pca, ~ .x %>%
reduce_dimensions(
sample, symbol, count_scaled,
method = "PCA",
action="get",
transform = log1p)))
PCA3_NK_elli <- PCA_level3 %>%
pluck("pca", 5) %>%
ggplot(aes(x = PC1, y = PC2, colour = !!as.symbol(LEVEL), label = sample)) +
geom_point() +
stat_ellipse(type = 't')+
theme_bw()
PCA3_NK_elli
sig_size <- 60
sil_tb <-
tibble(sig_size = 1:sig_size) %>%
# slice(1) %>%
mutate(sil_df = map(sig_size, ~ sig_select(mean_contrast_L3, LEVEL, .x))) %>%
mutate(sil_df = map(sil_df, ~.x %>% sil_func(LEVEL)))
sil_data <- sil_tb %>%
unnest(sil_df) %>%
nest(plot_data = - !!as.symbol(pre(LEVEL)))
Signature size selection plot for mono_derived cell:
mono_sil <- sil_data %>%
pluck("plot_data", 1) %>%
ggplot(aes(real_size, sil)) +
geom_line() +
geom_point()
mono_sil
sil_data %>%
pluck("plot_data", 1) %>%
pull(sil) %>%
which.max()
## [1] 29
The peak is reached when sig_size = \(29\). PCA at sig_size = \(29\), \ but silhouette score has only minimal improvement than that when sig_size = \(17\):
sig_size <- 17
PCA_level3 <- mean_contrast_L3 %>%
sig_select(LEVEL, sig_size) %>%
nest(pca= - !!as.symbol(pre(LEVEL))) %>%
mutate(pca = map(pca, ~ .x %>%
distinct(sample, symbol, count_scaled, !!as.symbol(LEVEL)))) %>%
mutate(pca = map(pca, ~ .x %>%
reduce_dimensions(
sample, symbol, count_scaled,
method = "PCA",
action="get",
transform = log1p)))
PCA3_mono_sil <- PCA_level3 %>%
pluck("pca", 1) %>%
ggplot(aes(x = PC1, y = PC2, colour = !!as.symbol(LEVEL), label = sample)) +
geom_point() +
stat_ellipse(type = 't')+
theme_bw()
PCA3_mono_sil
Signature size selection plot for t cell:
t_sil <- sil_data %>%
pluck("plot_data", 2) %>%
ggplot(aes(real_size, sil)) +
geom_line() +
geom_point()
t_sil
sil_data %>%
pluck("plot_data", 2) %>%
pull(sil) %>%
which.max()
## [1] 31
The peak is reached when sig_size = \(26\). PCA at sig_size = \(26\):
sig_size <- 26
PCA_level3 <- mean_contrast_L3 %>%
sig_select(LEVEL, sig_size) %>%
nest(pca= - !!as.symbol(pre(LEVEL))) %>%
mutate(pca = map(pca, ~ .x %>%
distinct(sample, symbol, count_scaled, !!as.symbol(LEVEL)))) %>%
mutate(pca = map(pca, ~ .x %>%
reduce_dimensions(
sample, symbol, count_scaled,
method = "PCA",
action="get",
transform = log1p)))
PCA3_t_sil <- PCA_level3 %>%
pluck("pca", 2) %>%
ggplot(aes(x = PC1, y = PC2, colour = !!as.symbol(LEVEL), label = sample)) +
geom_point() +
stat_ellipse(type = 't')+
theme_bw()
PCA3_t_sil
Signature size selection plot for granulocytes:
granulocyte_sil <- sil_data %>%
pluck("plot_data", 3) %>%
ggplot(aes(real_size, sil)) +
geom_line() +
geom_point()
granulocyte_sil
sil_data %>%
pluck("plot_data", 3) %>%
pull(sil) %>%
which.max()
## [1] 1
The peak is reached when sig_size = \(1\). PCA at sig_size = \(1\):
sig_size <- 1
PCA_level3 <- mean_contrast_L3 %>%
sig_select(LEVEL, sig_size) %>%
nest(pca= - !!as.symbol(pre(LEVEL))) %>%
mutate(pca = map(pca, ~ .x %>%
distinct(sample, symbol, count_scaled, !!as.symbol(LEVEL)))) %>%
mutate(pca = map(pca, ~ .x %>%
reduce_dimensions(
sample, symbol, count_scaled,
method = "PCA",
action="get",
transform = log1p)))
PCA3_granulocyte_sil <- PCA_level3 %>%
pluck("pca", 3) %>%
ggplot(aes(x = PC1, y = PC2, colour = !!as.symbol(LEVEL), label = sample)) +
geom_point() +
stat_ellipse(type = 't')+
theme_bw()
PCA3_granulocyte_sil
Signature size selection plot for b cells:
b_sil <- sil_data %>%
pluck("plot_data", 4) %>%
ggplot(aes(real_size, sil)) +
geom_line() +
geom_point()
b_sil
sil_data %>%
pluck("plot_data", 4) %>%
pull(sil) %>%
which.max()
## [1] 4
The peak is reached when sig_size = \(4\). PCA at sig_size = \(4\):
sig_size <- 4
PCA_level3 <- mean_contrast_L3 %>%
sig_select(LEVEL, sig_size) %>%
nest(pca= - !!as.symbol(pre(LEVEL))) %>%
mutate(pca = map(pca, ~ .x %>%
distinct(sample, symbol, count_scaled, !!as.symbol(LEVEL)))) %>%
mutate(pca = map(pca, ~ .x %>%
reduce_dimensions(
sample, symbol, count_scaled,
method = "PCA",
action="get",
transform = log1p)))
PCA3_b_sil <- PCA_level3 %>%
pluck("pca", 4) %>%
ggplot(aes(x = PC1, y = PC2, colour = !!as.symbol(LEVEL), label = sample)) +
geom_point() +
stat_ellipse(type = 't')+
theme_bw()
PCA3_b_sil
Signature size selection plot for NK cells:
NK_sil <- sil_data %>%
pluck("plot_data", 5) %>%
ggplot(aes(real_size, sil)) +
geom_line() +
geom_point()
NK_sil
sil_data %>%
pluck("plot_data", 5) %>%
pull(sil) %>%
which.max()
## [1] 15
The peak is reached when sig_size = \(15\). PCA at sig_size = \(15\):
sig_size <- 15
PCA_level3 <- mean_contrast_L3 %>%
sig_select(LEVEL, sig_size) %>%
nest(pca= - !!as.symbol(pre(LEVEL))) %>%
mutate(pca = map(pca, ~ .x %>%
distinct(sample, symbol, count_scaled, !!as.symbol(LEVEL)))) %>%
mutate(pca = map(pca, ~ .x %>%
reduce_dimensions(
sample, symbol, count_scaled,
method = "PCA",
action="get",
transform = log1p)))
PCA3_NK_sil <- PCA_level3 %>%
pluck("pca", 5) %>%
ggplot(aes(x = PC1, y = PC2, colour = !!as.symbol(LEVEL), label = sample)) +
geom_point() +
stat_ellipse(type = 't')+
theme_bw()
PCA3_NK_sil